{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导入数据包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "# @File  : 中文文本分类20200609-4分类版本.ipynb\n",
    "# @Author: 英俊\n",
    "# @Date  : 2020/6/9\n",
    "# @Software: jupyter notebook\n",
    "# 复旦大学语料文本分类\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import jieba\n",
    "\n",
    "import os\n",
    "\n",
    "import pickle  # 持久化\n",
    "\n",
    "from numpy import *\n",
    "\n",
    "#数据提取\n",
    "import sklearn\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF向量转换类\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer  # TF_IDF向量生成类\n",
    "\n",
    "\n",
    "#模型建立\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier\n",
    "from sklearn.svm import SVC,LinearSVC\n",
    "from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "import xgboost\n",
    "from xgboost import XGBClassifier\n",
    "# 集成模块\n",
    "from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier\n",
    "from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,AdaBoostRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "# from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,AdaBoostRegressor\n",
    "\n",
    "#Pipeline 使用一系列 (key, value) 键值对来构建,其中 key 是你给这个步骤起的名字， value 是一个评估器对象:\n",
    "from sklearn.pipeline import Pipeline\n",
    "#准确率，精确率，召回率，f1\n",
    "from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report\n",
    "# 保存数据\n",
    "import joblib\n",
    "\n",
    "# 读取数据\n",
    "def readFile(path):\n",
    "    with open(path, 'r', errors='ignore') as file:  # 文档中编码有些问题，所有用errors过滤错误\n",
    "        content = file.read()\n",
    "        return content\n",
    "\n",
    "# 这块没用，供日后研究一下\n",
    "# 保存文件\n",
    "def saveFile(path, result):\n",
    "    with open(path, 'w', errors='ignore') as file:\n",
    "        file.write(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "import os\n",
    "\n",
    "import pickle  # 持久化\n",
    "\n",
    "from numpy import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    resultList=[]\n",
    "    labeList=[]\n",
    "    inputPath=\"./data/\"\n",
    "    fatherLists = os.listdir(inputPath)  # 主目录\n",
    "    for eachDir in fatherLists:  # 遍历主目录中各个文件夹\n",
    "        print(eachDir)\n",
    "        eachPath = inputPath + eachDir + \"/\"  # 保存主目录中每个文件夹目录，便于遍历二级文件\n",
    "        print(eachPath)\n",
    "        childLists = os.listdir(eachPath)  # 获取每个文件夹中的各个文件\n",
    "    #     print(eachPath)\n",
    "    #     each_resultPath = resultPath + eachDir + \"/\"  # 分词结果文件存入的目录\n",
    "        for eachFile in childLists:  # 遍历每个文件夹中的子文件\n",
    "                    eachPathFile = eachPath + eachFile  # 获得每个文件路径\n",
    "                    content = readFile(eachPathFile)  # 调用上面函数读取内容\n",
    "                    result = (str(content)).replace(\"\\r\\n\", \"\").strip()  # 删除多余空行与空格\n",
    "                    result = content.replace(\"\\r\\n\",\"\").strip()\n",
    "                    labeList.append(eachDir)\n",
    "                    resultList.append(result)\n",
    "                    cutResult = jieba.cut(result)  # 默认方式分词，分词结果用空格隔开\n",
    "    return labeList,resultList"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看获取的标签与文本是否完整"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C3-Art\n",
      "./data/C3-Art/\n",
      "C4-Literature\n",
      "./data/C4-Literature/\n",
      "C5-Education\n",
      "./data/C5-Education/\n",
      "C6-Philosophy\n",
      "./data/C6-Philosophy/\n"
     ]
    }
   ],
   "source": [
    "labeList,resultList=get_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "825\n",
      "825\n"
     ]
    }
   ],
   "source": [
    "print(len(labeList))\n",
    "print(len(resultList))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 创建pandas对象进行查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>result</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    label                                             result\n",
       "0  C3-Art  【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...\n",
       "1  C3-Art  【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...\n",
       "2  C3-Art  【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...\n",
       "3  C3-Art  【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...\n",
       "4  C3-Art  【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dictTmp={ \"label\":labeList ,\"result\":resultList}\n",
    "df_all=pd.DataFrame(dictTmp)\n",
    "df_all.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 查看数据类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】199510\\n【原刊页号】61-62\\n【分 类 号】Z1\\n【分 类 名】出版工作、图书评介\\n【 作  者 】杨小民\\n【复印期号】199602\\n【 标  题 】图书评论应当重视对书籍装帧艺术的评价\\n【 正  文 】\\n    图书评论是近代报刊业兴起后，在世界各国得到长足发展的一种新型评论体裁。而不论是书评理论还是书评实践都有一个不小的疏漏，即忽\\n视了图书的形式因素。因为图书是内容与形式的综合体，忽视了“图书形式”这一重要方面，会导致在图书评论活动中忽视对图书的出版形式这\\n一重要方面的品评论述，而这对于出版物的达到基本要求：“形神俱佳”（“形”指书装艺术，“神”指内容叙述）或最高要求“尽善尽美”（\\n“尽善”指内容而言，“尽美”指形式而言）无疑是有缺憾的。\\n    图书的形式因素即为书籍的装帧设计艺术（以下简称“书装艺术”）。它的内容应当包括：封面、封底、书脊、环衬、扉页、字体、字号、\\n插图、版式、护封等。装帧设计应是图书中的重要内容，顺理成章地应成为书评文章中不可或缺的评论对象。然而，在当前报刊上大量刊登的书\\n评文章中谈及这一方面的极为少见。这一偏颇势必会对中国出版物综合水平的提高产生不良的影响。\\n    图书出版事业是人类的思维活动和精神成果与科学技术相结合的一项系统工程。而书装艺术则渗透着“出版人”的思维活动和印刷科技的水\\n平两个因素。设计者的艺术构思，通过印刷工艺的精心制作，与图书的内容达到协调一致，才形成一本精美的形神俱佳的图书。\\n    如今，我国的一些出版社，对图书的装帧设计重视不够，这既成为书评作者忽视书装艺术的评论的一个潜因，他们认为许多图书的书装艺术\\n不值一提或难以一说；同时，也人为地造成了对书装艺术粗糙现象的不合理宽容。究其原因，出版社不愿投入应有的资金和人力是主要问题。书\\n装艺术本身也是体现出版物品位高低的一项重要因素。在现代图书出版印刷中，应投入必要的资金，以避免参加国际图书博览会的中国图书再被\\n人们讥笑为“展翅高飞”、“鞠躬尽瘁”了。（由于纸质差，装订落后，我国图书陈列于国际展台时，暖气会使书册张开弯曲，这叫“展翅高飞\\n”；还有则为书脊软塌，不能直立，弯腰驼背，则称“鞠躬尽瘁”。）\\n    编辑素养的欠缺，也直接影响到书装艺术的优劣。在我国的出版业中，编辑通常是提供书装要求，并参与设计方案的。参与的前提，应该是\\n要具备一定的艺术素质和审美眼光，但如今有相当一部分编辑缺乏这一点。他们对艺术规律，对美术设计者从事的工作特性知之甚少，他们的参\\n与从某种意义上来说甚至成为一种盲目的干涉：“外行”指挥“内行”。大至约束个框子，小至书名作者的位置安放和颜色的指派。不难设想，\\n在这种缺乏平等探讨的格局下，要求所设计出来的封扉等的艺术效果将是什么样子。\\n    当然，提出这些问题，并不是反对文字编辑对美编工作的参与，而是希望各个出版社应在平时增加对书装艺术的知识的介绍和培训，以指导\\n编辑们以科学艺术的眼光来参与并审定书装设计方案，使我们的出版物真正成为内容与形式美和谐统一的精神产品。\\n    书评工作者本身的观念的局限是导致书评活动中忽视对书装艺术作出评价的一个重要性因素。\\n    书评不同于文艺评论。文艺评论是对文艺作品进行的学术界定。当前，书评文章中有种不良倾向——书评朝文艺评论方向发展。这就违背了\\n书评的宗旨，降低了书评本身的价值。仅仅注意抓框架结构，评内容主题，而忽略了外在形式因素。这种评论方式是不完整的，也是不科学的。\\n所以，书评人员应调整自己的书评观念，把书的内容与形式因素放到同等重要的地位（不否认因文而有主次之分），进行综合评论。唯其如此，\\n一篇完整而优秀的书评，才能使出版者、著作者、编辑者和读者多方面的获益。\\n    书装艺术既然是构成图书的有机组成部分，那么，缺少对书装艺术的评价就意味着书评工作的不完整。\\n    图书是精神和物质、内容和形式的综合体，是人类社会的精神产品。书装艺术是构成图书的重要组成部分，正如高斯先生在《出版审美论》\\n（1994年版）中所言：“图书的装帧设计，不仅为图书穿上一套美观的外衣，而且应该使图书的形式通过艺术构思、艺术手法而和内容统一起来\\n，反映出图书内容的美，反映出图书所蕴含的生命力的美。”\\n    “……一部图书的装帧设计，其审美价值虽然只属于个体，但个体的积累，却可以造成一个历史时期的出版事业的审美价值。”\\n    这些论述足以说明，装帧设计对于图书，除了形式美方面有其重要意义和作用外，更有在提高图书整体质量上的重要意义和重要作用。\\n    装帧设计本身，具有独特的艺术价值。同时，书装艺术也起着一种以艺术形式宣示图书内容的直观作用。图书进入流通领域，这种宣示既发\\n挥了一种无可替代的引导读者的作用，既给读者以美的鉴赏和启发，又引发了读者阅读的兴趣和购买的动机。这种社会价值超出了装帧设计艺术\\n价值本身的范围，而对整个图书市场起着不可忽视的调摄作用。\\n    当今世界，在图书出版领域，已形成三种以书装艺术风格来促销的流派：英国以庄重、豪华、大方为特征；日本为首的东方文化风格，以和\\n谐、含蕴、抒情见长；美国的现代派风格，以感官刺激为特征。这三者在图书营销上各有成效，在读者圈内有着广泛而深远的影响。哪一类图书\\n应该采取何种风格，所谓“量体裁衣”，因书制宜，是编辑工作者所应考虑的，也是书评工作者进行评论的依据。\\n                                    （本文责任编辑  韩忠良）＊'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all[\"result\"][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据清洗\n",
    "（根据不同的文本对象进行不同的处理，不要轻易加数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def remove_text(text):\n",
    "    text=text.replace('\\n','')\n",
    "    text=text.replace(' ','')\n",
    "    text=text.replace('【','')\n",
    "    text=text.replace('】','')\n",
    "    text=text.replace('（','')\n",
    "    text=text.replace('）','')\n",
    "    text=text.replace('、','')\n",
    "    text=text.replace('。','')\n",
    "    text=text.replace('；','')\n",
    "    text=text.replace('“','')\n",
    "    text=text.replace('”','')\n",
    "    text=text.replace('…','')\n",
    "    text=text.replace('，','')\n",
    "    text=text.replace('：','')\n",
    "    text=text.replace('《','')\n",
    "    text=text.replace('》','')\n",
    "    text=text.replace('[','')\n",
    "    text=text.replace(']','')\n",
    "    text=text.replace('—','')\n",
    "    return text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 查看清洗后的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类号Z1分类名出版工作图书评介作者杨小民复印期号199602标题图书评论应当重视对书籍装帧艺术的评价正文图书评论是近代报刊业兴起后在世界各国得到长足发展的一种新型评论体裁而不论是书评理论还是书评实践都有一个不小的疏漏即忽视了图书的形式因素因为图书是内容与形式的综合体忽视了图书形式这一重要方面会导致在图书评论活动中忽视对图书的出版形式这一重要方面的品评论述而这对于出版物的达到基本要求形神俱佳形指书装艺术神指内容叙述或最高要求尽善尽美尽善指内容而言尽美指形式而言无疑是有缺憾的图书的形式因素即为书籍的装帧设计艺术以下简称书装艺术它的内容应当包括封面封底书脊环衬扉页字体字号插图版式护封等装帧设计应是图书中的重要内容顺理成章地应成为书评文章中不可或缺的评论对象然而在当前报刊上大量刊登的书评文章中谈及这一方面的极为少见这一偏颇势必会对中国出版物综合水平的提高产生不良的影响图书出版事业是人类的思维活动和精神成果与科学技术相结合的一项系统工程而书装艺术则渗透着出版人的思维活动和印刷科技的水平两个因素设计者的艺术构思通过印刷工艺的精心制作与图书的内容达到协调一致才形成一本精美的形神俱佳的图书如今我国的一些出版社对图书的装帧设计重视不够这既成为书评作者忽视书装艺术的评论的一个潜因他们认为许多图书的书装艺术不值一提或难以一说同时也人为地造成了对书装艺术粗糙现象的不合理宽容究其原因出版社不愿投入应有的资金和人力是主要问题书装艺术本身也是体现出版物品位高低的一项重要因素在现代图书出版印刷中应投入必要的资金以避免参加国际图书博览会的中国图书再被人们讥笑为展翅高飞鞠躬尽瘁了由于纸质差装订落后我国图书陈列于国际展台时暖气会使书册张开弯曲这叫展翅高飞还有则为书脊软塌不能直立弯腰驼背则称鞠躬尽瘁编辑素养的欠缺也直接影响到书装艺术的优劣在我国的出版业中编辑通常是提供书装要求并参与设计方案的参与的前提应该是要具备一定的艺术素质和审美眼光但如今有相当一部分编辑缺乏这一点他们对艺术规律对美术设计者从事的工作特性知之甚少他们的参与从某种意义上来说甚至成为一种盲目的干涉外行指挥内行大至约束个框子小至书名作者的位置安放和颜色的指派不难设想在这种缺乏平等探讨的格局下要求所设计出来的封扉等的艺术效果将是什么样子当然提出这些问题并不是反对文字编辑对美编工作的参与而是希望各个出版社应在平时增加对书装艺术的知识的介绍和培训以指导编辑们以科学艺术的眼光来参与并审定书装设计方案使我们的出版物真正成为内容与形式美和谐统一的精神产品书评工作者本身的观念的局限是导致书评活动中忽视对书装艺术作出评价的一个重要性因素书评不同于文艺评论文艺评论是对文艺作品进行的学术界定当前书评文章中有种不良倾向书评朝文艺评论方向发展这就违背了书评的宗旨降低了书评本身的价值仅仅注意抓框架结构评内容主题而忽略了外在形式因素这种评论方式是不完整的也是不科学的所以书评人员应调整自己的书评观念把书的内容与形式因素放到同等重要的地位不否认因文而有主次之分进行综合评论唯其如此一篇完整而优秀的书评才能使出版者著作者编辑者和读者多方面的获益书装艺术既然是构成图书的有机组成部分那么缺少对书装艺术的评价就意味着书评工作的不完整图书是精神和物质内容和形式的综合体是人类社会的精神产品书装艺术是构成图书的重要组成部分正如高斯先生在出版审美论1994年版中所言图书的装帧设计不仅为图书穿上一套美观的外衣而且应该使图书的形式通过艺术构思艺术手法而和内容统一起来反映出图书内容的美反映出图书所蕴含的生命力的美一部图书的装帧设计其审美价值虽然只属于个体但个体的积累却可以造成一个历史时期的出版事业的审美价值这些论述足以说明装帧设计对于图书除了形式美方面有其重要意义和作用外更有在提高图书整体质量上的重要意义和重要作用装帧设计本身具有独特的艺术价值同时书装艺术也起着一种以艺术形式宣示图书内容的直观作用图书进入流通领域这种宣示既发挥了一种无可替代的引导读者的作用既给读者以美的鉴赏和启发又引发了读者阅读的兴趣和购买的动机这种社会价值超出了装帧设计艺术价值本身的范围而对整个图书市场起着不可忽视的调摄作用当今世界在图书出版领域已形成三种以书装艺术风格来促销的流派英国以庄重豪华大方为特征日本为首的东方文化风格以和谐含蕴抒情见长美国的现代派风格以感官刺激为特征这三者在图书营销上各有成效在读者圈内有着广泛而深远的影响哪一类图书应该采取何种风格所谓量体裁衣因书制宜是编辑工作者所应考虑的也是书评工作者进行评论的依据本文责任编辑韩忠良＊\n"
     ]
    }
   ],
   "source": [
    "txt=remove_text(df_all[\"result\"][0])\n",
    "print(txt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_all['result_pred'] =[remove_text(i) for i in df_all['result']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>result</th>\n",
       "      <th>result_pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...</td>\n",
       "      <td>文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...</td>\n",
       "      <td>文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...</td>\n",
       "      <td>文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...</td>\n",
       "      <td>文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...</td>\n",
       "      <td>文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    label                                             result  \\\n",
       "0  C3-Art  【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...   \n",
       "1  C3-Art  【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...   \n",
       "2  C3-Art  【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...   \n",
       "3  C3-Art  【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...   \n",
       "4  C3-Art  【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...   \n",
       "\n",
       "                                         result_pred  \n",
       "0  文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...  \n",
       "1  文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...  \n",
       "2  文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...  \n",
       "3  文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...  \n",
       "4  文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Label标签化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>result</th>\n",
       "      <th>result_pred</th>\n",
       "      <th>label_pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...</td>\n",
       "      <td>文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...</td>\n",
       "      <td>文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...</td>\n",
       "      <td>文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...</td>\n",
       "      <td>文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...</td>\n",
       "      <td>文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    label                                             result  \\\n",
       "0  C3-Art  【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...   \n",
       "1  C3-Art  【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...   \n",
       "2  C3-Art  【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...   \n",
       "3  C3-Art  【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...   \n",
       "4  C3-Art  【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...   \n",
       "\n",
       "                                         result_pred  label_pred  \n",
       "0  文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...           0  \n",
       "1  文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...           0  \n",
       "2  文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...           0  \n",
       "3  文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...           0  \n",
       "4  文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...           0  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
    "x=[]\n",
    "encoder_x=LabelEncoder()\n",
    "x=encoder_x.fit_transform(df_all['label'])\n",
    "# print(lelist)\n",
    "df_all['label_pred']=x\n",
    "df_all.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看数据的不平衡性，以后要进行修改"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
      " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1\n",
      " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2\n",
      " 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n",
      " 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3\n",
      " 3 3 3 3 3 3 3 3 3 3 3]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
    "x=[]\n",
    "encoder_x=LabelEncoder()\n",
    "x=encoder_x.fit_transform(df_all['label'])\n",
    "# onehotencoder = onehotencoder(categorical_features = [0])\n",
    "# x=onehotencoder.fit_transform(x).toarray()\n",
    "# x=[:,1:]\n",
    "print(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 进行停用词清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- encoding:utf-8 -*-\n",
    "import codecs\n",
    "\n",
    "import jieba\n",
    "\n",
    "import jieba.analyse\n",
    "\n",
    "import jieba.posseg\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# import param\n",
    "\n",
    "# import util\n",
    "\n",
    "# import param\n",
    "\n",
    "# import util\n",
    "\n",
    "stopwords = {}\n",
    "for line in codecs.open('stopword.txt', 'r', 'utf-8'):\n",
    "    stopwords[line.rstrip()] = 1\n",
    "############################ 定义分词函数 ############################\n",
    "\n",
    "def split_word(text, stopwords):\n",
    "    word_list = jieba.cut(text)\n",
    "    start = True\n",
    "    result = ''\n",
    "    for word in word_list:\n",
    "        word = word.strip()\n",
    "        if word not in stopwords:\n",
    "            if start:\n",
    "                result = word\n",
    "                start = False\n",
    "            else:\n",
    "                result += word\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这个步骤最耗时"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.450 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>result</th>\n",
       "      <th>result_pred</th>\n",
       "      <th>label_pred</th>\n",
       "      <th>result_pred_test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...</td>\n",
       "      <td>文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...</td>\n",
       "      <td>0</td>\n",
       "      <td>文献号2340原文出处图书评论原刊地名原刊期号199510原刊页号6162分类号Z1分类名图...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...</td>\n",
       "      <td>文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...</td>\n",
       "      <td>0</td>\n",
       "      <td>文献号2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136137分类号Z1分...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...</td>\n",
       "      <td>文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...</td>\n",
       "      <td>0</td>\n",
       "      <td>文献号4116原文出处学术原刊地名原刊期号199706原刊页号7781分类号J6分类名音乐舞...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...</td>\n",
       "      <td>文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...</td>\n",
       "      <td>0</td>\n",
       "      <td>文献号3036原文出处文艺理论原刊地名沪原刊期号199504原刊页号17分类号J1分类名文艺...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C3-Art</td>\n",
       "      <td>【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...</td>\n",
       "      <td>文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...</td>\n",
       "      <td>0</td>\n",
       "      <td>文献号3066原文出处文学评论原刊地名京原刊期号199506原刊页号91102分类号J1分类...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    label                                             result  \\\n",
       "0  C3-Art  【 文献号 】1-2340\\n【原文出处】中国图书评论\\n【原刊地名】沈阳\\n【原刊期号】1...   \n",
       "1  C3-Art  【 文献号 】1-2681\\n【原文出处】武汉大学学报：哲社版\\n【原刊期号】199605\\...   \n",
       "2  C3-Art  【 文献号 】2-4116\\n【原文出处】云南学术探索\\n【原刊地名】昆明\\n【原刊期号】1...   \n",
       "3  C3-Art  【 文献号 】1-3036\\n【原文出处】文艺理论研究\\n【原刊地名】沪\\n【原刊期号】19...   \n",
       "4  C3-Art  【 文献号 】1-3066\\n【原文出处】文学评论\\n【原刊地名】京\\n【原刊期号】1995...   \n",
       "\n",
       "                                         result_pred  label_pred  \\\n",
       "0  文献号1-2340原文出处中国图书评论原刊地名沈阳原刊期号199510原刊页号61-62分类...           0   \n",
       "1  文献号1-2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136-137分类号...           0   \n",
       "2  文献号2-4116原文出处云南学术探索原刊地名昆明原刊期号199706原刊页号77～81分类...           0   \n",
       "3  文献号1-3036原文出处文艺理论研究原刊地名沪原刊期号199504原刊页号6-17分类号J...           0   \n",
       "4  文献号1-3066原文出处文学评论原刊地名京原刊期号199506原刊页号91-102分类号J...           0   \n",
       "\n",
       "                                    result_pred_test  \n",
       "0  文献号2340原文出处图书评论原刊地名原刊期号199510原刊页号6162分类号Z1分类名图...  \n",
       "1  文献号2681原文出处武汉大学学报哲社版原刊期号199605原刊页号136137分类号Z1分...  \n",
       "2  文献号4116原文出处学术原刊地名原刊期号199706原刊页号7781分类号J6分类名音乐舞...  \n",
       "3  文献号3036原文出处文艺理论原刊地名沪原刊期号199504原刊页号17分类号J1分类名文艺...  \n",
       "4  文献号3066原文出处文学评论原刊地名京原刊期号199506原刊页号91102分类号J1分类...  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pd_all['review_pred'] =[text_remove_punctuation(i) for i in pd_all['review']]\n",
    "df_all['result_pred_test'] =[split_word(i,stopwords) for i in df_all['result_pred']]\n",
    "df_all.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#将数据随机分为3:1\n",
    "feature=df_all['result_pred_test']\n",
    "target=df_all['label_pred']\n",
    "X_train, X_test, y_train, y_test = train_test_split(feature,target,test_size=0.3, random_state=33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 数据查看\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "#机器学习算法模型\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier\n",
    "from sklearn.svm import SVC,LinearSVC\n",
    "from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "# 特征提取\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "#Pipeline 使用一系列 (key, value) 键值对来构建,其中 key 是你给这个步骤起的名字， value 是一个评估器对象:\n",
    "from sklearn.pipeline import Pipeline\n",
    "#准确率，精确率，召回率，f1\n",
    "from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 创建各类cv=CountVectorizer()和tf_idf工具\n",
    "cv=CountVectorizer()\n",
    "tdf=TfidfVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#%%\n",
    "# Count vectoriser --> LogisticRegression()\n",
    "\n",
    "# 分类模型\n",
    "\n",
    "#1.逻辑回归\n",
    "lr=LogisticRegression()\n",
    "# 贝叶斯\n",
    "#2.多项式贝叶斯\n",
    "mb=MultinomialNB()\n",
    "#3.伯努利贝叶斯\n",
    "bb=BernoulliNB()\n",
    "# 支持向量机\n",
    "#4.支持向量机\n",
    "svc=SVC(kernel='rbf')\n",
    "svc1=SVC(kernel='linear')\n",
    "svc2=SVC(kernel='poly')\n",
    "svc3=SVC(kernel='sigmoid')\n",
    "\n",
    "#5.\n",
    "linearsvc=LinearSVC()\n",
    "#6.决策树\n",
    "dtc=DecisionTreeClassifier(random_state=22)\n",
    "#7.随机森林\n",
    "rfc=RandomForestClassifier(random_state=22)\n",
    "#9.KNN分类器\n",
    "knn=KNeighborsClassifier()\n",
    "\n",
    "modelList=[lr,mb,bb,svc,svc1,svc2,svc3,linearsvc,dtc,rfc,knn]\n",
    "\n",
    "#11个模型\n",
    "m_len=len(modelList)\n",
    "\n",
    "# 形成 9个模型 2个提取特征 5个指标  \n",
    "# 提取特征分类器\n",
    "textVectoriser=[cv,tdf]\n",
    "textv_len=len(textVectoriser)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "accuracy_score_list=[]\n",
    "precision_score_list=[]\n",
    "recall_score_list=[]\n",
    "f1_score_list=[]\n",
    "classification_report_list=[]\n",
    "modelClass=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for i in range(m_len):\n",
    "    for j in range(textv_len):\n",
    "        pipeline = Pipeline([('vectorizer', textVectoriser[j]), ('classifier', modelList[i])])   \n",
    "        pipeline.fit(X_train,y_train)\n",
    "        pred=pipeline.predict(X_test)\n",
    "        accuracy_score_list.append(accuracy_score(y_test,pred))\n",
    "        #以为以下的有些只适合二分类\n",
    "#         precision_score_list.append(precision_score(y_test,pred))\n",
    "#         f1_score_list.append(f1_score(y_test,pred))\n",
    "#         recall_score_list.append(f1_score(y_test,pred))\n",
    "# #         classification_report_list.append(classification_report(y_test,pred))\n",
    "#         modelClass.append(pipeline)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "new_ticks = []\n",
    "name=[]\n",
    "modelNamelist=['逻辑回归','多项式贝叶斯','伯努利贝叶斯','RBF核SVM'\n",
    "               ,'线性核SVM','多项式SVM','sigmoidSVM'\n",
    "               ,'LinearSVC','决策树','随机森林','KNN']\n",
    "modelNamelist2=['lr','mb','bb','svc','svc1','svc2','svc3','l'+'\\n'+'svc','dtc','rfc','knn']\n",
    "textVectorNamelist = ['词袋','TDF']\n",
    "for i in range(m_len):\n",
    "    for j in range(textv_len):\n",
    "         new_ticks.append('\\n'.join([modelNamelist2[i],textVectorNamelist[j]]))\n",
    "         name.append(modelNamelist[i]+textVectorNamelist[j])\n",
    "         "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.figure.Figure at 0x24fcd3b6cc0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from pylab import *\n",
    "mpl.rcParams['font.sans-serif'] = ['SimHei']\n",
    "#plot根据列表绘制出有意义的图形，linewidth是图形线宽，可省略\n",
    "# plt.plot(input_values,squares,linewidth=5)\n",
    "plt.figure(figsize=(12,5),dpi=80)\n",
    "print(len(accuracy_score_list))\n",
    "plt.bar(range(len(accuracy_score_list)),accuracy_score_list,linewidth=5)\n",
    "#设置图标标题\n",
    "plt.title(\"不同管道模型准确率\",fontsize = 24)\n",
    "#设置坐标轴标签\n",
    "plt.xlabel(\"模型类型\",fontsize = 0.2)\n",
    "plt.ylabel(\"准确率\",fontsize = 0.5)\n",
    "#设置刻度标记的大小\n",
    "plt.tick_params(axis='both',labelsize = 14)\n",
    "#打开matplotlib查看器，并显示绘制图形\n",
    "#这是一半\n",
    "plt.xticks(range(22),new_ticks[:22])\n",
    "plt.show()\n",
    "# plt.xticks(new_ticks)\n",
    "# plt.xticks(new)\n",
    "\n",
    "\n",
    "# print(new_ticks[max_index_index])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "准确率最高的是 线性核SVM词袋\n",
      "max 0.8467741935483871\n"
     ]
    }
   ],
   "source": [
    "max_value=max(accuracy_score_list)\n",
    "index = accuracy_score_list.index(max_value)\n",
    "print('准确率最高的是',name[index])\n",
    "print('max',max_value)\n",
    "final_index=index\n",
    "final_name=name[index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
